Importing Packages¶

In [1]:
# dataframe package
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# packages for creating the map
import holoviews as hv
from datashader.utils import lnglat_to_meters
import colorcet as cc
hv.extension('bokeh')

# ML packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score

# xgboost
from xgboost import XGBRegressor

EDA¶

In [2]:
# Load the California housing dataset (CSV expected alongside the notebook).
housing = pd.read_csv("housing.csv")
In [3]:
# Peek at 10 random rows to get a feel for the raw data.
housing.sample(10)
Out[3]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
9593 -120.46 37.33 4.0 786.0 116.0 368.0 109.0 6.3215 138200.0 INLAND
8178 -118.11 33.80 36.0 1680.0 291.0 744.0 280.0 4.6600 244800.0 <1H OCEAN
6578 -118.18 34.20 44.0 1473.0 250.0 668.0 239.0 8.7200 415900.0 <1H OCEAN
18760 -122.25 40.60 16.0 2753.0 494.0 1414.0 459.0 3.8323 128300.0 INLAND
16446 -121.29 38.15 23.0 4183.0 633.0 1886.0 628.0 4.8787 175300.0 INLAND
12840 -121.43 38.66 35.0 1814.0 367.0 1076.0 372.0 2.7179 81100.0 INLAND
839 -122.06 37.60 17.0 5159.0 832.0 3174.0 817.0 5.8704 234400.0 NEAR BAY
5085 -118.28 33.98 39.0 1306.0 345.0 1332.0 331.0 1.9564 92200.0 <1H OCEAN
6927 -118.07 34.00 42.0 1392.0 351.0 1471.0 348.0 2.6300 143800.0 <1H OCEAN
13363 -117.64 33.99 29.0 1005.0 152.0 513.0 149.0 2.4375 181300.0 INLAND
In [4]:
# Histogram of every numeric column to inspect the distributions.
housing.hist(bins=50, figsize=(20,15))
plt.show()

The data shows a high level of skewness; most of the features don't follow a normal distribution.

In [5]:
# Column dtypes and non-null counts (reveals missing values in total_bedrooms).
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

Here we can see that we have NULL values for the total bedrooms, we can deal with these values later.

In [6]:
# Summary statistics, transposed so each feature is a row (.T == .transpose()).
housing.describe().T
Out[6]:
count mean std min 25% 50% 75% max
longitude 20640.0 -119.569704 2.003532 -124.3500 -121.8000 -118.4900 -118.01000 -114.3100
latitude 20640.0 35.631861 2.135952 32.5400 33.9300 34.2600 37.71000 41.9500
housing_median_age 20640.0 28.639486 12.585558 1.0000 18.0000 29.0000 37.00000 52.0000
total_rooms 20640.0 2635.763081 2181.615252 2.0000 1447.7500 2127.0000 3148.00000 39320.0000
total_bedrooms 20433.0 537.870553 421.385070 1.0000 296.0000 435.0000 647.00000 6445.0000
population 20640.0 1425.476744 1132.462122 3.0000 787.0000 1166.0000 1725.00000 35682.0000
households 20640.0 499.539680 382.329753 1.0000 280.0000 409.0000 605.00000 6082.0000
median_income 20640.0 3.870671 1.899822 0.4999 2.5634 3.5348 4.74325 15.0001
median_house_value 20640.0 206855.816909 115395.615874 14999.0000 119600.0000 179700.0000 264725.00000 500001.0000
In [7]:
# One boxplot per numeric feature; sharey=False gives each its own y-scale
# so small-valued columns (e.g. median_income) remain readable next to large ones.
housing.plot(kind='box', 
            subplots=True, 
            sharey=False, 
            figsize=(25, 15))
# Widen the horizontal gap between subplots so the tick labels don't overlap.
plt.subplots_adjust(wspace=1) 

plt.show()

Looking at the boxplot for each of our numerical features, we can see that most of them have large spreads in their values (outliers); this is, of course, due to the skewness of the data.

In [8]:
# Pairwise Pearson correlations over the numeric columns only.
housing.corr(numeric_only=True)
Out[8]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
longitude 1.000000 -0.924664 -0.108197 0.044568 0.069608 0.099773 0.055310 -0.015176 -0.045967
latitude -0.924664 1.000000 0.011173 -0.036100 -0.066983 -0.108785 -0.071035 -0.079809 -0.144160
housing_median_age -0.108197 0.011173 1.000000 -0.361262 -0.320451 -0.296244 -0.302916 -0.119034 0.105623
total_rooms 0.044568 -0.036100 -0.361262 1.000000 0.930380 0.857126 0.918484 0.198050 0.134153
total_bedrooms 0.069608 -0.066983 -0.320451 0.930380 1.000000 0.877747 0.979728 -0.007723 0.049686
population 0.099773 -0.108785 -0.296244 0.857126 0.877747 1.000000 0.907222 0.004834 -0.024650
households 0.055310 -0.071035 -0.302916 0.918484 0.979728 0.907222 1.000000 0.013033 0.065843
median_income -0.015176 -0.079809 -0.119034 0.198050 -0.007723 0.004834 0.013033 1.000000 0.688075
median_house_value -0.045967 -0.144160 0.105623 0.134153 0.049686 -0.024650 0.065843 0.688075 1.000000
In [9]:
# Rank all features by their linear correlation with the target.
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)
Out[9]:
median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64
In [10]:
# Scatter matrix of the features most correlated with the target (plus the target itself).
attributes = ["median_income", "total_rooms", "housing_median_age", "latitude", "median_house_value" ]
pd.plotting.scatter_matrix(housing[attributes], figsize=(25,20))
plt.show()
In [11]:
# Annotated correlation heatmap, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(housing.corr(numeric_only=True), cmap="coolwarm", annot=True, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

After looking at the correlation matrix and its heatmap, we can see that the median house value is mainly affected by the median income of the household. This feature has the highest impact on the house value.

Another thing to note here, is the high number of correlated values, whether they are proportionate or inversely proportionate.

Data Visualization¶

In [12]:
# Copy the frame and reproject lon/lat degrees into Web Mercator meters,
# which is the coordinate system the map tiles below expect.
housing_plot = housing.copy()
mercator_x, mercator_y = lnglat_to_meters(housing["longitude"], housing["latitude"])
housing_plot["longitude"] = mercator_x
housing_plot["latitude"] = mercator_y
In [13]:
# Coordinates are now in Web Mercator meters (large negative x, ~4.5e6 y).
housing_plot.head()
Out[13]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -1.360658e+07 4.562488e+06 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -1.360547e+07 4.559667e+06 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -1.360769e+07 4.558257e+06 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -1.360881e+07 4.558257e+06 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -1.360881e+07 4.558257e+06 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [14]:
# ESRI basemap with one point per district, colored by median house value.
# Fixed axis-label typo in redim: "Longtitude" -> "Longitude".
esri = hv.element.tiles.ESRI().redim(x="Longitude", y="Latitude").opts(alpha=0.3, bgcolor='black', width=1000, height=700)
points = hv.Points(housing_plot, kdims=["longitude", "latitude"]).opts(color="median_house_value", cmap="jet", colorbar=True, tools=["hover"])

# Overlay the points on the tile source.
esri * points
Out[14]:

The above insights give us an indication of how the median income and the ocean proximity affect the price of a house; now it's time to start data processing.

Data Processing¶

First, I'll convert the ocean proximity into a category type

In [15]:
# ocean_proximity takes a small fixed set of values; store it as a pandas category.
housing["ocean_proximity"] = housing["ocean_proximity"].astype("category")

Secondly, let's fill in the NULL values using the average value in the total bedroom columns

In [16]:
# Impute the 207 missing total_bedrooms values (20640 - 20433) with the column mean.
# The original used chained `housing["total_bedrooms"].fillna(..., inplace=True)`,
# which mutates a possibly-temporary object: it emits a FutureWarning under
# pandas 2.x copy-on-write and stops working in pandas 3.0. Assigning the
# result back is the safe, equivalent form.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(housing["total_bedrooms"].mean())

We can introduce more features into the dataset that might help us better predict the value of the house

In [17]:
# Derive three ratio features that normalize the raw counts by household/room totals.
housing = housing.assign(
    rooms_per_household=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_household=housing["population"] / housing["households"],
)
In [18]:
# Verify the three new ratio columns were appended.
housing.head()
Out[18]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity rooms_per_household bedrooms_per_room population_per_household
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY 6.984127 0.146591 2.555556
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY 6.238137 0.155797 2.109842
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY 8.288136 0.129516 2.802260
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY 5.817352 0.184458 2.547945
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY 6.281853 0.172096 2.181467
In [19]:
# Re-check correlations now that the engineered ratio features are included.
housing.corr(numeric_only=True)
Out[19]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value rooms_per_household bedrooms_per_room population_per_household
longitude 1.000000 -0.924664 -0.108197 0.044568 0.069260 0.099773 0.055310 -0.015176 -0.045967 -0.027540 0.076231 0.002476
latitude -0.924664 1.000000 0.011173 -0.036100 -0.066658 -0.108785 -0.071035 -0.079809 -0.144160 0.106389 -0.092352 0.002366
housing_median_age -0.108197 0.011173 1.000000 -0.361262 -0.318998 -0.296244 -0.302916 -0.119034 0.105623 -0.153277 0.130709 0.013191
total_rooms 0.044568 -0.036100 -0.361262 1.000000 0.927253 0.857126 0.918484 0.198050 0.134153 0.133798 -0.180316 -0.024581
total_bedrooms 0.069260 -0.066658 -0.318998 0.927253 1.000000 0.873910 0.974725 -0.007682 0.049454 0.001535 0.069660 -0.028354
population 0.099773 -0.108785 -0.296244 0.857126 0.873910 1.000000 0.907222 0.004834 -0.024650 -0.072213 0.005063 0.069863
households 0.055310 -0.071035 -0.302916 0.918484 0.974725 0.907222 1.000000 0.013033 0.065843 -0.080598 0.027430 -0.027309
median_income -0.015176 -0.079809 -0.119034 0.198050 -0.007682 0.004834 0.013033 1.000000 0.688075 0.326895 -0.511031 0.018766
median_house_value -0.045967 -0.144160 0.105623 0.134153 0.049454 -0.024650 0.065843 0.688075 1.000000 0.151948 -0.220049 -0.023737
rooms_per_household -0.027540 0.106389 -0.153277 0.133798 0.001535 -0.072213 -0.080598 0.326895 0.151948 1.000000 -0.347501 -0.004852
bedrooms_per_room 0.076231 -0.092352 0.130709 -0.180316 0.069660 0.005063 0.027430 -0.511031 -0.220049 -0.347501 1.000000 0.002373
population_per_household 0.002476 0.002366 0.013191 -0.024581 -0.028354 0.069863 -0.027309 0.018766 -0.023737 -0.004852 0.002373 1.000000

It's time to convert the categorical data into numerical

In [20]:
# One-hot encode ocean_proximity into indicator columns (one per category level).
housing = pd.get_dummies(housing, columns=["ocean_proximity"])

Now it's the time to scale the features before we feed them to our ML model

In [21]:
# Separate the features from the prediction target.
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]
In [22]:
# Scale every feature to [0, 1] so no column dominates purely by magnitude.
# NOTE(review): the scaler is fit on the FULL dataset here, before the
# train/test split in cell In[26] below -- a mild form of data leakage.
# Prefer fitting on X_train only and transforming X_test with that fitted scaler.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
# fit_transform returns a bare ndarray; rebuild a DataFrame to keep the column names.
X = pd.DataFrame(X_scaled, columns=X.columns)
In [23]:
# Spot-check the scaled features: all values should now lie in [0, 1].
X.sample(10)
Out[23]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household bedrooms_per_room population_per_household ocean_proximity_<1H OCEAN ocean_proximity_INLAND ocean_proximity_ISLAND ocean_proximity_NEAR BAY ocean_proximity_NEAR OCEAN
18334 0.218127 0.521785 0.705882 0.074368 0.135475 0.038118 0.133860 0.282672 0.019453 0.073335 0.000789 0.0 0.0 0.0 1.0 0.0
2468 0.444223 0.435707 0.647059 0.036116 0.036623 0.019984 0.036343 0.170694 0.039410 0.035027 0.002038 0.0 1.0 0.0 0.0 0.0
2159 0.452191 0.449522 0.921569 0.045857 0.050900 0.020684 0.054267 0.143481 0.032659 0.039555 0.001244 0.0 1.0 0.0 0.0 0.0
18159 0.231076 0.512221 0.529412 0.063279 0.053383 0.026486 0.059201 0.413194 0.042898 0.026871 0.001556 1.0 0.0 0.0 0.0 0.0
8538 0.596614 0.143464 0.647059 0.019279 0.026847 0.020180 0.032396 0.332202 0.021212 0.053097 0.002381 1.0 0.0 0.0 0.0 0.0
10705 0.660359 0.113709 0.490196 0.051656 0.071695 0.017237 0.073837 0.142660 0.026028 0.052747 0.000548 1.0 0.0 0.0 0.0 0.0
1685 0.206175 0.579171 0.529412 0.096622 0.096369 0.046274 0.093735 0.267245 0.041191 0.034150 0.001774 0.0 0.0 0.0 1.0 0.0
13790 0.676295 0.199787 0.098039 0.108398 0.116077 0.056111 0.109357 0.204797 0.039388 0.037636 0.001866 0.0 1.0 0.0 0.0 0.0
10602 0.651394 0.120085 0.137255 0.062007 0.077747 0.031111 0.079263 0.282893 0.029814 0.046363 0.001297 1.0 0.0 0.0 0.0 0.0
5990 0.659363 0.165781 0.705882 0.087873 0.083313 0.037585 0.086992 0.371664 0.040241 0.031813 0.001484 0.0 1.0 0.0 0.0 0.0
In [24]:
# The target is left unscaled (raw dollar values).
y.sample(10)
Out[24]:
18621    261500.0
11388    294900.0
13541     86100.0
7157     156300.0
20630    112000.0
9421     478600.0
9431     240900.0
13116    175700.0
7387     112000.0
6832     245700.0
Name: median_house_value, dtype: float64
In [25]:
# Drop the '<' from the dummy column name -- presumably because downstream
# tooling (e.g. XGBoost) rejects feature names containing '<'; TODO confirm.
# Reassigning instead of `inplace=True` avoids in-place mutation; the result
# is identical.
X = X.rename(columns={"ocean_proximity_<1H OCEAN": "ocean_proximity_1H OCEAN"})
In [26]:
# Hold out 33% of the data for evaluation; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Prediction¶

Now, it's time to make some predictions by creating a ML model.

1.Linear Regression Model¶

In [27]:
# Baseline model: ordinary least-squares linear regression.
model = LinearRegression()
model.fit(X_train, y_train)
Out[27]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [28]:
# Evaluate the linear baseline on the held-out test set.
y_pred = model.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE is equivalent and works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse:.2f}, Explained Variance Score: {evs:.3f}, Coefficient of Determination: {r2:.3f}")
Root Mean Squared Error: 71761.59, Explained Variance Score: 0.614, Coefficient of Determination: 0.614

2.XGBoost Model¶

First, I'll create a model using some hand-picked hyperparameters; then I'll try to do some hyperparameter tuning.

2.1 XGB Model¶

In [29]:
# Gradient-boosted tree ensemble with hand-picked hyperparameters.
# NOTE(review): no random_state is set; with subsample=0.75 the row sampling
# is stochastic, so results may not be exactly reproducible -- consider
# fixing a seed.
xgmodel = XGBRegressor(n_estimators=1000, 
                       learning_rate=0.08, 
                       gamma=0, 
                       subsample=0.75,
                       colsample_bytree=1,
                       max_depth=7
                       )
xgmodel.fit(X_train, y_train)
Out[29]:
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.08, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.08, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)
In [30]:
# Evaluate the untuned XGBoost model on the held-out test set.
y_pred = xgmodel.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE is equivalent and works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse:.2f}, Explained Variance Score: {evs:.3f}, Coefficient of Determination: {r2:.3f}")
Root Mean Squared Error: 45231.57, Explained Variance Score: 0.847, Coefficient of Determination: 0.846

Here we can see a huge improvement in the accuracy of our model. There might still be room for improvement by doing some hyperparameter tuning.

2.2. Hyperparameter Tuning¶

In [166]:
# Hyperparameter grid: 3^5 = 243 candidates (x 5 CV folds = 1215 fits).
# NOTE(review): the recorded grid-search output below also lists 'gamma': [0],
# which is absent here, and the execution counts jump from In[30] to In[166] --
# the saved outputs look stale. Restart the kernel and run top-to-bottom.
params = {'max_depth': [3,6,10],
          'learning_rate': [0.01, 0.08, 0.1],
          'n_estimators': [100, 500, 1000],
          'colsample_bytree': [0.3, 0.7, 1],
          'subsample':[0.3, 0.75, 1]
         }
In [168]:
# Exhaustive 5-fold CV grid search. refit=True (the default) retrains the best
# candidate on the full training set, so clf.predict below uses the best model.
# n_jobs=-1 runs the 1215 fits in parallel across all cores; the selected
# model and scores are unchanged, only wall-clock time improves.
clf = GridSearchCV(estimator=xgmodel,
                   param_grid=params,
                   scoring='neg_mean_squared_error',
                   n_jobs=-1,
                   verbose=10)
clf.fit(X_train, y_train)
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Out[168]:
GridSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    callbacks=None, colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=1,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=0, gpu_id=-1,
                                    grow_policy='depthwise',
                                    importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.08, max_bin=256,...
                                    max_delta_step=0, max_depth=7, max_leaves=0,
                                    min_child_weight=1, missing=nan,
                                    monotone_constraints='()',
                                    n_estimators=1000, n_jobs=0,
                                    num_parallel_tree=1, predictor='auto',
                                    random_state=0, ...),
             param_grid={'colsample_bytree': [0.3, 0.7, 1], 'gamma': [0],
                         'learning_rate': [0.01, 0.08, 0.1],
                         'max_depth': [3, 6, 10],
                         'n_estimators': [100, 500, 1000],
                         'subsample': [0.3, 0.75, 1]},
             scoring='neg_mean_squared_error', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    callbacks=None, colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=1,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=0, gpu_id=-1,
                                    grow_policy='depthwise',
                                    importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.08, max_bin=256,...
                                    max_delta_step=0, max_depth=7, max_leaves=0,
                                    min_child_weight=1, missing=nan,
                                    monotone_constraints='()',
                                    n_estimators=1000, n_jobs=0,
                                    num_parallel_tree=1, predictor='auto',
                                    random_state=0, ...),
             param_grid={'colsample_bytree': [0.3, 0.7, 1], 'gamma': [0],
                         'learning_rate': [0.01, 0.08, 0.1],
                         'max_depth': [3, 6, 10],
                         'n_estimators': [100, 500, 1000],
                         'subsample': [0.3, 0.75, 1]},
             scoring='neg_mean_squared_error', verbose=1)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.08, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.08, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)
In [169]:
# best_score_ is the negated MSE (scoring='neg_mean_squared_error'),
# so negate it and take the square root to report the cross-validated RMSE.
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0)) 
Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000, 'subsample': 0.75}
Lowest RMSE:  45441.33175988161
In [170]:
# Evaluate the tuned (refit) model on the held-out test set.
y_pred = clf.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE is equivalent and works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse:.2f}, Explained Variance Score: {evs:.3f}, Coefficient of Determination: {r2:.3f}")
Root Mean Squared Error: 44408.17, Explained Variance Score: 0.852, Coefficient of Determination: 0.852

We can see that the tuning has improved the accuracy of our model a bit. Depending on our business problem, we can decide whether we need to keep improving the accuracy or whether this is good enough.

In [ ]: